Set up the R computing environment

#set up chart theme function and load required packages
chart.theme.size = 14

source("../r_config/r_env_setup.R") #load packages and cutom ggplot theme function and presets
path_data = "../data/ISLR_data/"

library(MASS)
#install.packages("ISLR")
library(ISLR)

R Basics

vectors, data, matrices, subsetting

# create a numberical vector
x=c(2,7,5)
x
## [1] 2 7 5
# create a vector from sequence from, number of elments and increment by
y=seq(from=4, length=3,by=3)
y
## [1]  4  7 10
# create a vector from sequence from to and increment by
y=seq(from=4, to=13, by=3)
y
## [1]  4  7 10 13
x+y
## [1]  6 14 15 15
x/y
## [1] 0.5000000 1.0000000 0.5000000 0.1538462
x^y
## [1]      16  823543 9765625    8192
x[2] # print second element of vector x - starts from 1
## [1] 7
x[2:3]
## [1] 7 5
x[-2] # remove the element 2 from the vector and return the rest
## [1] 2 5
x[-c(1,2)] # remove elements 1 and 2 from the vector
## [1] 5
# 12 element matrix with 4 rows and 3 col
z=matrix(seq(1,12),4,3) 
z
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
# rows 3 to 4 and col 2 to 3
z[3:4,2:3]
##      [,1] [,2]
## [1,]    7   11
## [2,]    8   12
z[,2:3] # all rows and columns 2, 3
##      [,1] [,2]
## [1,]    5    9
## [2,]    6   10
## [3,]    7   11
## [4,]    8   12
z[,1] # all rows and column 1
## [1] 1 2 3 4
z[,1,drop=FALSE] # retain the matrix
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    4
dim(z) # no. of rows and col
## [1] 4 3
ls() # list of data objects
##  [1] "chart_footer"                   "chart_footnote"                
##  [3] "chart_footnote_save"            "chart_format_bar_x_axis_rotate"
##  [5] "chart_format_bar1"              "chart_format_bar2"             
##  [7] "chart_format_default"           "chart_format_hist_no_minor"    
##  [9] "chart_format_hist_no_vgrid"     "chart_format_hist1"            
## [11] "chart_theme_bar"                "chart_theme_default"           
## [13] "chart_theme_histogram"          "chart_theme_minimal"           
## [15] "chart_theme_scatter"            "chart_theme01"                 
## [17] "chart.line.size"                "chart.theme.size"              
## [19] "color_highlight_pal"            "color_primary_pal"             
## [21] "geom_bar_custom"                "legend_size_override"          
## [23] "legend_title_hide"              "legend_top"                    
## [25] "package"                        "packages"                      
## [27] "path_data"                      "text.size"                     
## [29] "x"                              "y"                             
## [31] "z"
rm(y) # remove vector y
ls()
##  [1] "chart_footer"                   "chart_footnote"                
##  [3] "chart_footnote_save"            "chart_format_bar_x_axis_rotate"
##  [5] "chart_format_bar1"              "chart_format_bar2"             
##  [7] "chart_format_default"           "chart_format_hist_no_minor"    
##  [9] "chart_format_hist_no_vgrid"     "chart_format_hist1"            
## [11] "chart_theme_bar"                "chart_theme_default"           
## [13] "chart_theme_histogram"          "chart_theme_minimal"           
## [15] "chart_theme_scatter"            "chart_theme01"                 
## [17] "chart.line.size"                "chart.theme.size"              
## [19] "color_highlight_pal"            "color_primary_pal"             
## [21] "geom_bar_custom"                "legend_size_override"          
## [23] "legend_title_hide"              "legend_top"                    
## [25] "package"                        "packages"                      
## [27] "path_data"                      "text.size"                     
## [29] "x"                              "z"

Generating random data, graphics

x=runif(50)
x
##  [1] 0.59865236 0.07734090 0.50563056 0.24184711 0.29518013 0.99841695
##  [7] 0.36579037 0.34105074 0.31082351 0.42464203 0.57105918 0.86676923
## [13] 0.11483153 0.10723313 0.98432896 0.03290910 0.84335147 0.24569949
## [19] 0.24117104 0.56326188 0.38073335 0.57331388 0.77516890 0.06589140
## [25] 0.64475563 0.66179147 0.54256958 0.73781172 0.72120992 0.24783744
## [31] 0.55754446 0.65712876 0.91594592 0.61441602 0.89631957 0.33404201
## [37] 0.95006025 0.30279009 0.78508569 0.01851865 0.48507190 0.29634177
## [43] 0.47050787 0.15363834 0.21350483 0.79741760 0.08071203 0.18318078
## [49] 0.93075617 0.92878729
xd = data.frame(x)
qplot(data=xd, x, geom="histogram", binwidth=0.1) + chart_theme_default

y=rnorm(50)
yd = data.frame(x)
qplot(data=yd, y, geom="histogram", binwidth=0.1)

# combine x and y vectors to a dataframe
data = data.frame(x, y)

#scatterplot x and y
ggplot(data, aes(x=x, y=y)) + chart_theme_default +
  geom_point(size = 6, color= color_primary_pal[2], alpha = .5) +
  xlab("Random Uniform") +
  ylab("Random Normal")

plot(x,y,xlab="Random Uniform",ylab="Random Normal",pch="*",col="blue")

#panel with two charts stacked
#par(mfrow=c(2,1))
plot(x,y)

hist(y)

#panel with two charts side by side
#par(mfrow=c(1,2))
hist(y)

hist(x)

### Reading in data

Auto = read_csv(file = paste(path_data, "Auto.csv", sep =""))
problems(Auto)
## Source: local data frame [139 x 4]
## 
##    row col               expected actual
## 1  188   1 no trailing characters     .5
## 2  190   1 no trailing characters     .5
## 3  191   1 no trailing characters     .5
## 4  195   1 no trailing characters     .5
## 5  197   1 no trailing characters     .5
## 6  202   1 no trailing characters     .5
## 7  203   1 no trailing characters     .5
## 8  204   1 no trailing characters     .5
## 9  207   1 no trailing characters     .5
## 10 212   1 no trailing characters     .5
## .. ... ...                    ...    ...
names(Auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight"       "acceleration" "year"         "origin"      
## [9] "name"
dim(Auto)
## [1] 397   9
class(Auto)
## [1] "tbl_df"     "tbl"        "data.frame"
summary(Auto)
##       mpg          cylinders      displacement    horsepower       
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Length:397        
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:104.0   Class :character  
##  Median :23.00   Median :4.000   Median :146.0   Mode  :character  
##  Mean   :23.35   Mean   :5.458   Mean   :193.5                     
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0                     
##  Max.   :46.00   Max.   :8.000   Max.   :455.0                     
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2223   1st Qu.:13.80   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2800   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2970   Mean   :15.56   Mean   :75.99   Mean   :1.574  
##  3rd Qu.:3609   3rd Qu.:17.10   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##      name          
##  Length:397        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
plot(Auto$cylinders,Auto$mpg)

plot(Auto$cyl,Auto$mpg)

attach(Auto)
search()
##  [1] ".GlobalEnv"           "Auto"                 "package:ISLR"        
##  [4] "package:MASS"         "package:caret"        "package:lattice"     
##  [7] "package:GGally"       "package:energy"       "package:xtable"      
## [10] "package:htmlTable"    "package:knitr"        "package:RColorBrewer"
## [13] "package:tidyr"        "package:data.table"   "package:readr"       
## [16] "package:dplyr"        "package:gridExtra"    "package:grid"        
## [19] "package:stringr"      "package:lubridate"    "package:scales"      
## [22] "package:ggthemes"     "package:ggplot2"      "package:stats"       
## [25] "package:graphics"     "package:grDevices"    "package:utils"       
## [28] "package:datasets"     "package:methods"      "Autoloads"           
## [31] "package:base"
plot(cylinders,mpg)

cylinders=as.factor(cylinders)
plot(cylinders,mpg,xlab="Cylinders",ylab="Mpg",col="red")

pairs(mpg~cylinders+acceleration+weight,Auto)

LINEAR REGRESSION

Simple Linear Regression

#Linear regression
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
#?Boston
plot(medv~lstat,Boston)

fit1=lm(medv~lstat,data=Boston)
fit1
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Coefficients:
## (Intercept)        lstat  
##       34.55        -0.95
summary(fit1)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
abline(fit1,col="red")

names(fit1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
fit1$coefficients
## (Intercept)       lstat 
##  34.5538409  -0.9500494
coefficients(fit1)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
confint(fit1)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505
#predict outcomes (median value) for 3 data points
predict(fit1,data.frame(lstat=c(5,10,15)),interval="confidence")
##        fit      lwr      upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
#Using caret
fit1_caret <- train(medv~lstat, data = Boston, method = "lm")
fit1_caret
## Linear Regression 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## 
## Resampling results
## 
##   RMSE      Rsquared   RMSE SD   Rsquared SD
##   6.293631  0.5397662  0.382599  0.03151111 
## 
## 
names(fit1_caret)
##  [1] "method"       "modelInfo"    "modelType"    "results"     
##  [5] "pred"         "bestTune"     "call"         "dots"        
##  [9] "metric"       "control"      "finalModel"   "preProcess"  
## [13] "trainingData" "resample"     "resampledCM"  "perfNames"   
## [17] "maximize"     "yLimits"      "times"        "terms"       
## [21] "coefnames"    "xlevels"
names(fit1_caret$finalModel)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"        
## [13] "xNames"        "problemType"   "tuneValue"     "obsLevels"
summary(fit1_caret)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
fit1_caret$finalModel$coefficients
## (Intercept)       lstat 
##  34.5538409  -0.9500494
#score the model on the training data 
pred1_caret = predict(fit1_caret$finalModel, newdata = Boston)

#predict outcomes (median value) for 3 data points along with confidence intervals
predict(fit1_caret$finalModel,data.frame(lstat=c(5,10,15)),interval="confidence", level = .95)
##        fit      lwr      upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
summary(pred1_caret)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.52   18.45   23.76   22.53   27.95   32.91
#coefficients for the model
coefficients(fit1_caret$finalModel)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
#confidence interval for the model
confint(fit1_caret$finalModel, level = .95)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505
#fitted values from the model object
summary(fit1_caret$finalModel$fitted.values)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.52   18.45   23.76   22.53   27.95   32.91
summary(Boston$medv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.02   21.20   22.53   25.00   50.00
#plot(fit1_caret$finalModel)

#plot residuals and fitted values
ggplot(data = Boston, aes(x = lstat, y = medv)) + chart_theme_default +
  geom_point(color = color_primary_pal[1], size = 2, alpha = 0.7) +
  geom_point(aes(x = lstat, y = fit1_caret$finalModel$residuals, color = "Residuals"),
             size = 2, alpha = 0.7) +
  geom_hline(yintercept = 0, color = color_primary_pal[1]) +
  geom_line(aes(x = lstat, y = fit1_caret$finalModel$fitted.values)) +
  legend_top + legend_size_override(5) + legend_title_hide

#plot actual vs fitted values
ggplot(data = Boston, aes(y = medv, x = pred1_caret)) + chart_theme_default +
  geom_point(color = color_primary_pal[1], size = 2, alpha = 0.7) 

#Library for high quality model visualization
#http://www.strengejacke.de/sjPlot/sjp.lm/
#https://susanejohnston.wordpress.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
  
library(sjPlot)
# load sample data
data(Auto)
# set plot theme
sjp.setTheme(theme = "539")
# plot frequencies
sjp.frq(Auto$year)

sjp.lm(fit1, type = "pred")

par(mfrow=c(2,2))
#plot(fit1)
plot(fit1_caret$finalModel)

#from the curve in the residuals we can see that the model is not quite capturing the non-linearity

Multiple Linear Regression

#Linear model median value as a function of lstat and age
fit2=lm(medv~lstat+age,data=Boston)
fit2=train(medv~lstat+age, method = "lm", data=Boston)

summary(fit2)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
fit3=train(medv~., method = "lm", data=Boston)
summary(fit3)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit3$finalModel)

fit4=train(medv~., method = "lm", data=dplyr::select(Boston, -c(age, indus)))


summary(fit4)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
## crim         -0.108413   0.032779  -3.307 0.001010 ** 
## zn            0.045845   0.013523   3.390 0.000754 ***
## chas          2.718716   0.854240   3.183 0.001551 ** 
## nox         -17.376023   3.535243  -4.915 1.21e-06 ***
## rm            3.801579   0.406316   9.356  < 2e-16 ***
## dis          -1.492711   0.185731  -8.037 6.84e-15 ***
## rad           0.299608   0.063402   4.726 3.00e-06 ***
## tax          -0.011778   0.003372  -3.493 0.000521 ***
## ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
## black         0.009291   0.002674   3.475 0.000557 ***
## lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16

Non Linear Terms & Interactions

#interaction between lstat and age
fit5=lm(medv~lstat*age,Boston)
summary(fit5)
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16
fit6=lm(medv~lstat +I(lstat^2),Boston); summary(fit6)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
attach(Boston)
par(mfrow=c(1,1))
plot(medv~lstat)
points(lstat,fitted(fit6),col="red",pch=20)
fit7=lm(medv~poly(lstat,4))
points(lstat,fitted(fit7),col="blue",pch=20)

plot(1:20,1:20,pch=1:20,cex=2)

Qualitative Predictors

#Qualitative predictors

names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
summary(Carseats)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age       
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00  
##    Education    Urban       US     
##  Min.   :10.0   No :118   No :142  
##  1st Qu.:12.0   Yes:282   Yes:258  
##  Median :14.0                      
##  Mean   :13.9                      
##  3rd Qu.:16.0                      
##  Max.   :18.0
fit1=lm(Sales~.+Income*Advertising+Age*Price,Carseats)
summary(fit1)
## 
## Call:
## lm(formula = Sales ~ . + Income * Advertising + Age * Price, 
##     data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9208 -0.7503  0.0177  0.6754  3.3413 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
## CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
## Income              0.0108940  0.0026044   4.183 3.57e-05 ***
## Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
## Population          0.0001592  0.0003679   0.433 0.665330    
## Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
## ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
## ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
## Age                -0.0579466  0.0159506  -3.633 0.000318 ***
## Education          -0.0208525  0.0196131  -1.063 0.288361    
## UrbanYes            0.1401597  0.1124019   1.247 0.213171    
## USYes              -0.1575571  0.1489234  -1.058 0.290729    
## Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
## Price:Age           0.0001068  0.0001333   0.801 0.423812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared:  0.8761, Adjusted R-squared:  0.8719 
## F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
contrasts(Carseats$ShelveLoc)
##        Good Medium
## Bad       0      0
## Good      1      0
## Medium    0      1

R functions …

##Writing R functions
regplot=function(x,y){
  fit=lm(y~x)
  plot(x,y)
  abline(fit,col="red")
}
attach(Carseats)
regplot(Price,Sales)

#the ... arguments get passed as it is in the function
regplot=function(x,y,...){
  fit=lm(y~x)
  plot(x,y,...)
  abline(fit,col="red")
}
regplot(Price,Sales,xlab="Price",ylab="Sales",col="blue",pch=20)